library(MASS)
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
boston_scaled = scale(Boston)
str(boston_scaled)
## num [1:506, 1:14] -0.419 -0.417 -0.417 -0.416 -0.412 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:506] "1" "2" "3" "4" ...
## ..$ : chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:center")= Named num [1:14] 3.6135 11.3636 11.1368 0.0692 0.5547 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
## - attr(*, "scaled:scale")= Named num [1:14] 8.602 23.322 6.86 0.254 0.116 ...
## ..- attr(*, "names")= chr [1:14] "crim" "zn" "indus" "chas" ...
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
bins
## 0% 25% 50% 75% 100%
## -0.419366929 -0.410563278 -0.390280295 0.007389247 9.924109610
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(boston_scaled)
# choose randomly 80% of the rows
ind <- sample(n, size = n * 0.8)
# create train set
train <- boston_scaled[ind,]
# create test set
test <- boston_scaled[-ind,]
# save the correct classes from test data
correct_classes <- test$crime
# remove the crime variable from test data
test <- dplyr::select(test, -crime)
# LDA
lda.fit <- lda(crime ~ ., data = train)
# print the lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
##
## Prior probabilities of groups:
## low med_low med_high high
## 0.2574257 0.2326733 0.2623762 0.2475248
##
## Group means:
## zn indus chas nox rm age
## low 0.8588545 -0.8980181 -0.12090214 -0.8444985 0.47282047 -0.8022219
## med_low -0.0518548 -0.3134681 0.06274329 -0.6043089 -0.07656227 -0.4435540
## med_high -0.3966319 0.2589551 0.21052285 0.4037540 -0.01564562 0.4084285
## high -0.4872402 1.0171519 -0.07547406 1.0287999 -0.41182369 0.8034463
## dis rad tax ptratio black lstat
## low 0.8271116 -0.7024845 -0.7535811 -0.3929549 0.37757208 -0.75178515
## med_low 0.4402709 -0.5444763 -0.4710934 -0.1381793 0.34131759 -0.20857490
## med_high -0.3697539 -0.4282234 -0.2884499 -0.2413524 0.04496343 0.06298321
## high -0.8547941 1.6377820 1.5138081 0.7803736 -0.94029858 0.91353880
## medv
## low 0.53213671
## med_low 0.04721214
## med_high 0.07397994
## high -0.72444611
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3
## zn 0.098781533 0.525306859 -0.99942228
## indus 0.029911155 -0.333004741 0.39681899
## chas -0.084237148 -0.074520107 0.18346408
## nox 0.352927842 -0.820834305 -1.19899518
## rm -0.086810309 -0.009511551 -0.09432911
## age 0.229795060 -0.210561205 -0.35902832
## dis -0.062562559 -0.165654465 0.22071162
## rad 3.156749857 0.978399363 -0.14491531
## tax 0.006663194 0.016130427 0.69239722
## ptratio 0.139205135 -0.063029371 -0.42187778
## black -0.133174185 0.014375361 0.16161958
## lstat 0.244485522 -0.231722183 0.35534634
## medv 0.199973676 -0.405706802 -0.21375527
##
## Proportion of trace:
## LD1 LD2 LD3
## 0.9518 0.0362 0.0120
# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "orange", tex = 0.75, choices = c(1,2)){
heads <- coef(x)
arrows(x0 = 0, y0 = 0,
x1 = myscale * heads[,choices[1]],
y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
text(myscale * heads[,choices], labels = row.names(heads),
cex = tex, col=color, pos=3)
}
# target classes as numeric
classes <- as.numeric(train$crime)
# plot the lda results
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)
# cross tabulate the results
table(correct = correct_classes, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 19 4 0 0
## med_low 3 18 11 0
## med_high 0 1 18 1
## high 0 0 0 27
# load MASS and Boston
library(MASS)
library(ggplot2)
data('Boston')
Boston <- scale(Boston)
# euclidean distance matrix
dist_eu <- dist(Boston)
# look at the summary of the distances
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
# manhattan distance matrix
dist_man <- dist(Boston, method = 'manhattan')
# look at the summary of the distances
summary(dist_man)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2662 8.4832 12.6090 13.5488 17.7568 48.8618
# k-means clustering
km <-kmeans(Boston, centers = 3)
# plot the Boston dataset with clusters
pairs(Boston, col = km$cluster)

set.seed(123)
# determine the number of clusters
k_max <- 10
# calculate the total within sum of squares
twcss <- sapply(1:k_max, function(k){kmeans(Boston, k)$tot.withinss})
# visualize the results
qplot(x = 1:k_max, y = twcss, geom = 'line')

data(Boston)
Boston <- as.data.frame(scale(Boston))
km <- kmeans(Boston, centers = 5)
Boston$km <- km$cluster
lda.fit <- lda(km~., Boston)
classes <- as.numeric(Boston$km)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

library(MASS)
data(Boston)
boston_scaled = scale(Boston)
boston_scaled <- as.data.frame(boston_scaled)
bins <- quantile(boston_scaled$crim)
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
boston_scaled <- dplyr::select(boston_scaled, -crim)
boston_scaled <- data.frame(boston_scaled, crime)
n <- nrow(boston_scaled)
ind <- sample(n, size = n * 0.8)
train <- boston_scaled[ind,]
test <- boston_scaled[-ind,]
correct_classes <- test$crime
test <- dplyr::select(test, -crime)
# LDA
lda.fit <- lda(crime ~ ., data = train)
model_predictors <- dplyr::select(train, -crime)
# check the dimensions
dim(model_predictors)
## [1] 404 13
dim(lda.fit$scaling)
## [1] 13 3
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = as.numeric(train$crime))
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.